/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Per-target prefetch configuration.
 *
 *   PREFETCH     - instruction used to prefetch data that is only read
 *   PREFETCHW    - instruction used to prefetch data that will be written
 *   PREFETCHSIZE - prefetch distance, used as a displacement in the kernel
 *                  (e.g. "PREFETCH PREFETCHSIZE(A1)")
 *
 * The target macros (ATOM, CORE2, ...) are expected to come from the build
 * configuration / common.h.
 */

/* These targets all share one identical setting. */
#if defined(ATOM) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
    defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
/* On this target every "movsd" in the kernel is assembled as "movlpd". */
#define movsd		movlpd
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

/*
 * Fallback for targets that are not listed above.  Without it the PREFETCH
 * lines in the kernel would reach the assembler unexpanded and the file
 * would not assemble.  The values are the same as the GENERIC setting.
 */
#ifndef PREFETCH
#define PREFETCH	prefetcht0
#endif

#ifndef PREFETCHW
#define PREFETCHW	prefetcht0
#endif

#ifndef PREFETCHSIZE
#define PREFETCHSIZE	(16 * 20)
#endif

/*
 * Calling-convention dependent argument mapping.
 *
 * STACKSIZE is the number of bytes the prologue subtracts from %rsp.  The
 * OLD_* macros address arguments that were passed on the stack; they are
 * expressed relative to %rsp *after* that subtraction, so the return address
 * sits at STACKSIZE(%rsp) and the stack arguments follow it.
 *
 * ARG1..ARG6 are provided by common.h (presumably the integer argument
 * registers of the active ABI -- confirm there).
 */

/* Identical for both calling conventions. */
#define M	  ARG1
#define N	  ARG2

#ifdef WINDOWS_ABI

/* Larger frame: it also holds rdi, rsi and xmm6-xmm15 (see the prologue). */
#define STACKSIZE	256

/* Stack arguments start at 40 = return address (8) + 32-byte shadow space. */
#define OLD_LDA		 40 + STACKSIZE(%rsp)
#define OLD_X		 48 + STACKSIZE(%rsp)
#define OLD_INCX	 56 + STACKSIZE(%rsp)
#define OLD_Y		 64 + STACKSIZE(%rsp)
#define OLD_INCY	 72 + STACKSIZE(%rsp)
#define OLD_BUFFER	 80 + STACKSIZE(%rsp)

#define A	  ARG4
#define LDA	  ARG3			/* filled from OLD_LDA by the prologue */
#define X	  %rdi			/* filled from OLD_X by the prologue */
#define INCX	  %rsi			/* filled from OLD_INCX by the prologue */

#else

#define STACKSIZE	80

/* Only y, incy and buffer are passed on the stack. */
#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define A	  ARG3
#define LDA	  ARG4
#define X	  ARG5
#define INCX	  ARG6

#endif

/*
 * Register aliases used by the kernel body.
 */

/* Values loaded from the stack arguments in the prologue. */
#define Y	%r10		/* caller's y vector */
#define INCY	%r11		/* y stride (scaled to bytes in the prologue) */
#define BUFFER	%r12		/* scratch buffer supplied by the caller */

/* Working vectors once packing is done. */
#define NEW_X	BUFFER		/* alpha * x, stored contiguously */
#define NEW_Y	X		/* y (or its packed copy); reuses the X register */

/* Scratch and counters.  TEMP and I share %rax. */
#define TEMP	%rax
#define I	%rax		/* inner (row) loop counter */
#define IS	%r15		/* index of the first column of the current block */

/* Running pointers. */
#define A1	%rbx		/* columns IS and IS+1 (the latter via LDA) */
#define A2	%rbp		/* columns IS+2 and IS+3 (the latter via LDA) */
#define XX	%r13		/* cursor into NEW_X (also used while packing) */
#define YY	%r14		/* cursor into y / NEW_Y */

/*
 * ALPHA and xtemp1 share %xmm0: ALPHA is only read while x is being packed,
 * xtemp1 is first written after that.
 */
#define ALPHA  %xmm0

#define xtemp1 %xmm0		/* pair of x values for the current rows */
#define xtemp2 %xmm1		/* the following pair of x values */
#define yy1    %xmm2		/* pair of y values being updated */
#define yy2    %xmm3		/* the following pair of y values */

/* x values of the four block columns; broadcast to both lanes before the row loop. */
#define atemp1 %xmm4
#define atemp2 %xmm5
#define atemp3 %xmm6
#define atemp4 %xmm7

/* Partial sums for y[IS] .. y[IS+3]. */
#define xsum1  %xmm8
#define xsum2  %xmm9
#define xsum3  %xmm10
#define xsum4  %xmm11

/* Pairs of matrix elements, plus one temporary for products. */
#define a1     %xmm12
#define a2     %xmm13
#define a3     %xmm14
#define xt1    %xmm15

	/*
	 * Symmetric matrix-vector update kernel (packed-double SSE2 code).
	 *
	 * The code below reads, for each of the first N columns, the elements
	 * on and below the diagonal and uses every off-diagonal element twice,
	 * so that y accumulates (symmetric A) * (alpha * x).
	 *
	 * Arguments: M, N, alpha (xmm0; xmm2 under WINDOWS_ABI), A, LDA, X,
	 * INCX, then Y, INCY and BUFFER on the stack.  The symbol name and
	 * section setup come from PROLOGUE/PROFCODE in common.h.
	 */
	PROLOGUE
	PROFCODE

	/* Reserve the frame and save the callee-saved integer registers used below. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* The Microsoft x64 ABI also makes rdi, rsi and xmm6-xmm15 callee-saved. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	/* Arguments that did not fit in registers are fetched from the stack. */
	movq	OLD_LDA,   LDA
	movq	OLD_X,     X
	movq	OLD_INCX,  INCX

	/* alpha arrives in xmm2 here; the rest of the code expects it in xmm0 (ALPHA). */
	movaps	%xmm2, %xmm0
#endif

	movq	OLD_Y,     Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	/* Convert the strides and the leading dimension from elements to bytes. */
	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	/* Nothing to do when M <= 0. */
	testq	M, M
	jle	.L999

	/*
	 * Pack alpha * x into BUFFER as a contiguous vector of M elements.
	 * XX is the write cursor.  The movapd stores below require BUFFER to be
	 * 16-byte aligned.
	 */
	unpcklpd ALPHA, ALPHA		/* alpha in both lanes */

	movq	BUFFER, XX

	movq	M,  %rax
	sarq	$3, %rax		/* number of groups of 8 elements */
	jle	.L02
	ALIGN_3

.L01:
	/* Gather 8 strided elements of x, two per register. */
	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm4
	addq	INCX, X

	mulpd	ALPHA, %xmm1
	mulpd	ALPHA, %xmm2
	mulpd	ALPHA, %xmm3
	mulpd	ALPHA, %xmm4

	movapd	%xmm1, 0 * SIZE(XX)
	movapd	%xmm2, 2 * SIZE(XX)
	movapd	%xmm3, 4 * SIZE(XX)
	movapd	%xmm4, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	/* Remaining M mod 8 elements, one at a time. */
	movq	M, %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X

	mulsd	ALPHA, %xmm1

	movlpd	%xmm1, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/*
	 * x has been fully consumed, so its register is reused as NEW_Y.
	 * By default the kernel updates the caller's y in place.
	 */
	movq   Y, NEW_Y

	/* Move XX past the packed x, up to the next 512-byte boundary. */
	addq   $512, XX
	andq   $-512, XX

	/* INCY is in bytes: equal to SIZE means y is contiguous, no copy needed. */
	cmpq   $SIZE, INCY
	je    .L10

	/* Strided y: gather it into the buffer at XX and work on that copy. */
	movq   Y,  YY
	movq   XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax		/* number of groups of 8 elements */
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm3
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)
	movapd	%xmm1, 2 * SIZE(XX)
	movapd	%xmm2, 4 * SIZE(XX)
	movapd	%xmm3, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	/* Remaining M mod 8 elements of y. */
	movq	M, %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movsd	%xmm0, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	/* Column loop: IS is the index of the first column of the current block. */
	xorq	IS, IS		# is = 0

	/* Fewer than 4 columns in total: go straight to the column tails. */
	cmpq	$4, N
	jl	.L20
	ALIGN_3

.L11:
	/*
	 * Start of one 4-column panel.  A addresses the top-left element of the
	 * 4x4 diagonal block; it is advanced by 4 rows and 4 columns for the
	 * next pass.  A1 covers columns 0/1 and A2 columns 2/3 of the panel
	 * (the odd column is reached by adding LDA).
	 *
	 * Notation below: a(i,j) is the element in row i, column j of the panel,
	 * x0..x3 are the packed x values for rows IS..IS+3.
	 */
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	4 * SIZE(A, LDA, 4), A

	leaq	        (NEW_X, IS, SIZE), XX
	leaq	4 * SIZE(NEW_Y, IS, SIZE), YY	/* first row below the diagonal block */

	movapd		0 * SIZE(XX), atemp2		/* (x0, x1) */
	movapd		2 * SIZE(XX), atemp4		/* (x2, x3) */

	/*
	 * 4x4 diagonal block, using only elements on or below the diagonal.
	 * xsumJ collects two partial products per lane; their horizontal sum
	 * (taken at .L19) is the contribution to y[IS+J-1].
	 */
	movsd	 0 * SIZE(A1), xsum1
	movhpd	 1 * SIZE(A1), xsum1
	mulpd	 atemp2, xsum1			/* a(0,0)*x0, a(1,0)*x1 */

	movsd	 1 * SIZE(A1), xsum2
	movhpd	 1 * SIZE(A1, LDA, 1), xsum2
	mulpd	 atemp2, xsum2			/* a(1,0)*x0, a(1,1)*x1 */

	movsd	 2 * SIZE(A1), xsum3
	movhpd	 2 * SIZE(A1, LDA, 1), xsum3
	mulpd	 atemp2, xsum3			/* a(2,0)*x0, a(2,1)*x1 */

	movsd	 3 * SIZE(A1), xsum4
	movhpd	 3 * SIZE(A1, LDA, 1), xsum4
	mulpd	 atemp2, xsum4			/* a(3,0)*x0, a(3,1)*x1 */

	movsd	 2 * SIZE(A1), a1
	movhpd	 3 * SIZE(A1), a1
	mulpd	 atemp4, a1			/* a(2,0)*x2, a(3,0)*x3 */
	addpd	 a1, xsum1

	movsd	 2 * SIZE(A1, LDA, 1), a1
	movhpd	 3 * SIZE(A1, LDA, 1), a1
	mulpd	 atemp4, a1			/* a(2,1)*x2, a(3,1)*x3 */
	addpd	 a1, xsum2

	movsd	 2 * SIZE(A2), a1
	movhpd	 3 * SIZE(A2), a1
	mulpd	 atemp4, a1			/* a(2,2)*x2, a(3,2)*x3 */
	addpd	 a1, xsum3

	movsd	 3 * SIZE(A2), a1
	movhpd	 3 * SIZE(A2, LDA, 1), a1
	mulpd	 atemp4, a1			/* a(3,2)*x2, a(3,3)*x3 */
	addpd	 a1, xsum4

	/* Preload operands for the first rows below the diagonal block (rows 4..7). */
	movapd	 4 * SIZE(XX), xtemp1
	movapd	 6 * SIZE(XX), xtemp2

	movsd	 4 * SIZE(A1), a1		/* column 0, rows 4-5 */
	movhpd	 5 * SIZE(A1), a1
	movsd	 6 * SIZE(A1), a2		/* column 0, rows 6-7 */
	movhpd	 7 * SIZE(A1), a2
	movsd	 4 * SIZE(A1, LDA, 1), a3	/* column 1, rows 4-5 */
	movhpd	 5 * SIZE(A1, LDA, 1), a3

	movsd	 0 * SIZE(YY), yy1
	movhpd	 1 * SIZE(YY), yy1
	movsd	 2 * SIZE(YY), yy2
	movhpd	 3 * SIZE(YY), yy2

	/* Broadcast: atemp1..atemp4 = x0..x3, each in both lanes. */
#ifndef HAVE_SSE3
	movapd	 atemp2, atemp1
	unpcklpd atemp1, atemp1
	unpckhpd atemp2, atemp2
	movapd	 atemp4, atemp3
	unpcklpd atemp3, atemp3
	unpckhpd atemp4, atemp4
#else
	movddup	 atemp2, atemp1
	unpckhpd atemp2, atemp2
	movddup	 atemp4, atemp3
	unpckhpd atemp4, atemp4
#endif

	/* Step the cursors past the diagonal block. */
	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2

	/* I = number of 8-row groups below the diagonal block. */
	movq	M,  I
	subq	IS, I
	subq	$4, I
	sarq	$3, I
	jle	.L15
	ALIGN_3

.L12:
	/*
	 * Main row loop: 8 rows of the 4-column panel per iteration, handled as
	 * two 4-row halves.  Every loaded pair of matrix elements is used twice:
	 *
	 *   xsumJ += a * (x of these rows)   -> contribution to y[IS+J-1]
	 *   yy    += a * atempJ (x of col J) -> contribution to y of these rows
	 *
	 * On entry: a1 = column 0 rows 0-1, a2 = column 0 rows 2-3,
	 * a3 = column 1 rows 0-1, xtemp1/xtemp2 = x for rows 0-1 / 2-3,
	 * yy1/yy2 = y for rows 0-1 / 2-3 (all relative to the cursors).
	 * Loads for the following steps are interleaved with the arithmetic, so
	 * the a1/a2/a3 roles rotate and the instruction order is significant.
	 */
	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp1, a1
	addpd	 xt1,    xsum1
	addpd	 a1,     yy1
	movsd	 2 * SIZE(A1, LDA, 1), a1
	movhpd	 3 * SIZE(A1, LDA, 1), a1

	PREFETCH	PREFETCHSIZE(A1)

	movapd	 xtemp2, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp1, a2
	addpd	 xt1,    xsum1
	addpd	 a2,     yy2
	movsd	 0 * SIZE(A2), a2
	movhpd	 1 * SIZE(A2), a2

	movapd	 xtemp1, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp2, a3
	addpd	 xt1,    xsum2
	addpd	 a3,     yy1
	movsd	 2 * SIZE(A2), a3
	movhpd	 3 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH	PREFETCHSIZE(XX)
#endif

	movapd	 xtemp2, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp2, a1
	addpd	 xt1,    xsum2
	addpd	 a1,     yy2
	movsd	 0 * SIZE(A2, LDA, 1), a1
	movhpd	 1 * SIZE(A2, LDA, 1), a1

	movapd	 xtemp1, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp3, a2
	addpd	 xt1,    xsum3
	addpd	 a2,     yy1
	movsd	 2 * SIZE(A2, LDA, 1), a2
	movhpd	 3 * SIZE(A2, LDA, 1), a2

	PREFETCH	PREFETCHSIZE(A1, LDA, 1)

	movapd	 xtemp2, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp3, a3
	addpd	 xt1,    xsum3
	addpd	 a3,     yy2
	movsd	 4 * SIZE(A1), a3
	movhpd	 5 * SIZE(A1), a3

	/* Last column of the first half; x for rows 4-7 is fetched on the way. */
	movapd	 xtemp1, xt1
	movapd	 4 * SIZE(XX), xtemp1
	mulpd	 a1,     xt1
	mulpd	 atemp4, a1
	addpd	 xt1,    xsum4
	addpd	 a1,     yy1
	movsd	 6 * SIZE(A1), a1
	movhpd	 7 * SIZE(A1), a1

	movapd	 xtemp2, xt1
	movapd	 6 * SIZE(XX), xtemp2
	mulpd	 a2,     xt1
	mulpd	 atemp4, a2
	addpd	 xt1,    xsum4
	addpd	 a2,     yy2
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhpd	 5 * SIZE(A1, LDA, 1), a2

	/* Rows 0-3 of y are complete: store them and load rows 4-7. */
	movsd	 yy1, 0 * SIZE(YY)
	movhpd	 yy1, 1 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhpd	 5 * SIZE(YY), yy1

	movsd	 yy2, 2 * SIZE(YY)
	movhpd	 yy2, 3 * SIZE(YY)
	movsd	 6 * SIZE(YY), yy2
	movhpd	 7 * SIZE(YY), yy2

	/* Second half: rows 4-7.  Here a3/a1 hold column 0 and a2 column 1. */
	movapd	 xtemp1, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp1, a3
	addpd	 xt1,    xsum1
	addpd	 a3,     yy1
	movsd	 6 * SIZE(A1, LDA, 1), a3
	movhpd	 7 * SIZE(A1, LDA, 1), a3

	PREFETCH	PREFETCHSIZE(A2)

	movapd	 xtemp2, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp1, a1
	addpd	 xt1,    xsum1
	addpd	 a1,     yy2
	movsd	 4 * SIZE(A2), a1
	movhpd	 5 * SIZE(A2), a1

	movapd	 xtemp1, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp2, a2
	addpd	 xt1,    xsum2
	addpd	 a2,     yy1
	movsd	 6 * SIZE(A2), a2
	movhpd	 7 * SIZE(A2), a2

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movapd	 xtemp2, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp2, a3
	addpd	 xt1,    xsum2
	addpd	 a3,     yy2
	movsd	 4 * SIZE(A2, LDA, 1), a3
	movhpd	 5 * SIZE(A2, LDA, 1), a3

	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp3, a1
	addpd	 xt1,    xsum3
	addpd	 a1,     yy1
	movsd	 6 * SIZE(A2, LDA, 1), a1
	movhpd	 7 * SIZE(A2, LDA, 1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	/* From here on the loads already fetch operands for the next iteration. */
	movapd	 xtemp2, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp3, a2
	addpd	 xt1,    xsum3
	addpd	 a2,     yy2
	movsd	10 * SIZE(A1), a2
	movhpd	11 * SIZE(A1), a2

	movapd	 xtemp1, xt1
	movapd	 8 * SIZE(XX), xtemp1
	mulpd	 a3,     xt1
	mulpd	 atemp4, a3
	addpd	 xt1,    xsum4
	addpd	 a3,     yy1
	movsd	 8 * SIZE(A1, LDA, 1), a3
	movhpd	 9 * SIZE(A1, LDA, 1), a3

	movapd	 xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	 a1,     xt1
	mulpd	 atemp4, a1
	addpd	 xt1,    xsum4
	addpd	 a1,     yy2
	movsd	 8 * SIZE(A1), a1
	movhpd	 9 * SIZE(A1), a1

	/* Rows 4-7 of y are complete: store them and load the next rows 0-3. */
	movsd	 yy1, 4 * SIZE(YY)
	movhpd	 yy1, 5 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhpd	 9 * SIZE(YY), yy1

	movsd	 yy2, 6 * SIZE(YY)
	movhpd	 yy2, 7 * SIZE(YY)
	movsd	10 * SIZE(YY), yy2
	movhpd	11 * SIZE(YY), yy2

	addq	 $8 * SIZE, XX
	addq	 $8 * SIZE, YY
	addq	 $8 * SIZE, A1
	addq	 $8 * SIZE, A2

	decq	 I
	jg	 .L12
	ALIGN_3

.L15:
	/*
	 * 4-row remainder below the diagonal block: taken when bit 2 of
	 * (M - IS - 4) is set.  "test" clears SF and OF, so the jle below
	 * branches exactly when the tested bit is zero.
	 *
	 * Same update as one half of the .L12 body (without prefetches); the
	 * register state expected on entry is the one described there.
	 */
	movq	M,  I
	subq	IS, I
	subq	$4, I
	test	$4, I
	jle	.L17

	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp1, a1
	addpd	 xt1,    xsum1
	addpd	 a1,     yy1
	movsd	 2 * SIZE(A1, LDA, 1), a1
	movhpd	 3 * SIZE(A1, LDA, 1), a1

	movapd	 xtemp2, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp1, a2
	addpd	 xt1,    xsum1
	addpd	 a2,     yy2
	movsd	 0 * SIZE(A2), a2
	movhpd	 1 * SIZE(A2), a2

	movapd	 xtemp1, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp2, a3
	addpd	 xt1,    xsum2
	addpd	 a3,     yy1
	movsd	 2 * SIZE(A2), a3
	movhpd	 3 * SIZE(A2), a3

	movapd	 xtemp2, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp2, a1
	addpd	 xt1,    xsum2
	addpd	 a1,     yy2
	movsd	 0 * SIZE(A2, LDA, 1), a1
	movhpd	 1 * SIZE(A2, LDA, 1), a1

	movapd	 xtemp1, xt1
	mulpd	 a2,     xt1
	mulpd	 atemp3, a2
	addpd	 xt1,    xsum3
	addpd	 a2,     yy1
	movsd	 2 * SIZE(A2, LDA, 1), a2
	movhpd	 3 * SIZE(A2, LDA, 1), a2

	movapd	 xtemp2, xt1
	mulpd	 a3,     xt1
	mulpd	 atemp3, a3
	addpd	 xt1,    xsum3
	addpd	 a3,     yy2
	movsd	 4 * SIZE(A1, LDA, 1), a3
	movhpd	 5 * SIZE(A1, LDA, 1), a3

	/* Last column; operands for the rows after this group are fetched on the way. */
	movapd	 xtemp1, xt1
	movapd	 4 * SIZE(XX), xtemp1
	mulpd	 a1,     xt1
	mulpd	 atemp4, a1
	addpd	 xt1,    xsum4
	addpd	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhpd	 5 * SIZE(A1), a1

	movapd	 xtemp2, xt1
	movapd	 6 * SIZE(XX), xtemp2
	mulpd	 a2,     xt1
	mulpd	 atemp4, a2
	addpd	 xt1,    xsum4
	addpd	 a2,     yy2
	movsd	 6 * SIZE(A1), a2
	movhpd	 7 * SIZE(A1), a2

	/* Store the four finished y values and load the following ones. */
	movsd	 yy1, 0 * SIZE(YY)
	movhpd	 yy1, 1 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhpd	 5 * SIZE(YY), yy1

	movsd	 yy2, 2 * SIZE(YY)
	movhpd	 yy2, 3 * SIZE(YY)
	movsd	 6 * SIZE(YY), yy2
	movhpd	 7 * SIZE(YY), yy2

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

.L17:
	/*
	 * 2-row remainder.  IS only ever advances in steps of 4 from 0, so bit 1
	 * of M equals bit 1 of the remaining row count (M - IS - 4).
	 *
	 * On entry a1 = column 0 of these two rows, xtemp1 = their x values and
	 * yy1 = their y values.  a1 is reloaded for each following column.
	 */
	testq	$2, M
	jle	.L18

	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp1, a1
	addpd	 xt1,    xsum1
	addpd	 a1,     yy1
	movsd	 0 * SIZE(A1, LDA, 1), a1
	movhpd	 1 * SIZE(A1, LDA, 1), a1

	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp2, a1
	addpd	 xt1,    xsum2
	addpd	 a1,     yy1
	movsd	 0 * SIZE(A2), a1
	movhpd	 1 * SIZE(A2), a1

	movapd	 xtemp1, xt1
	mulpd	 a1,     xt1
	mulpd	 atemp3, a1
	addpd	 xt1,    xsum3
	addpd	 a1,     yy1
	movsd	 0 * SIZE(A2, LDA, 1), a1
	movhpd	 1 * SIZE(A2, LDA, 1), a1

	movapd	 xtemp1, xt1
	movapd	 2 * SIZE(XX), xtemp1
	mulpd	 a1,     xt1
	mulpd	 atemp4, a1
	addpd	 xt1,    xsum4
	addpd	 a1,     yy1
	/* Only one further row can follow, so a single element is fetched. */
	movsd	 2 * SIZE(A1), a1

	movsd	 yy1, 0 * SIZE(YY)
	movhpd	 yy1, 1 * SIZE(YY)
	movsd	 2 * SIZE(YY), yy1

	addq	 $2 * SIZE, XX
	addq	 $2 * SIZE, YY
	addq	 $2 * SIZE, A1
	addq	 $2 * SIZE, A2
	ALIGN_3

.L18:
	/*
	 * Final single row (M odd; IS is a multiple of 4, so bit 0 of M equals
	 * bit 0 of the remaining row count).
	 *
	 * Only the low lanes are meaningful here: a1 = element of column 0 in
	 * this row, xtemp1 = its x value, yy1 = its y value.  All arithmetic is
	 * therefore scalar.  The upper lanes may hold unrelated data and are
	 * never stored, so they must not take part in any floating-point
	 * operation.
	 */
	testq	$1, M
	jle	.L19

	movapd	 xtemp1, xt1
	mulsd	 a1,     xt1
	mulsd	 atemp1, a1
	addsd	 xt1,    xsum1
	addsd	 a1,     yy1
	movsd	 0 * SIZE(A1, LDA, 1), a1

	movapd	 xtemp1, xt1
	mulsd	 a1,     xt1
	mulsd	 atemp2, a1
	addsd	 xt1,    xsum2
	addsd	 a1,     yy1
	movsd	 0 * SIZE(A2), a1

	movapd	 xtemp1, xt1
	mulsd	 a1,     xt1
	mulsd	 atemp3, a1
	addsd	 xt1,    xsum3
	addsd	 a1,     yy1
	movsd	 0 * SIZE(A2, LDA, 1), a1

	movapd	 xtemp1, xt1
	mulsd	 a1,     xt1
	mulsd	 atemp4, a1
	addsd	 xt1,    xsum4
	addsd	 a1,     yy1

	movsd	 yy1, 0 * SIZE(YY)
	ALIGN_3

.L19:
	/*
	 * Finish the panel: reduce each xsum register horizontally so that
	 * xsum1 = (sum for y[IS], sum for y[IS+1]) and
	 * xsum3 = (sum for y[IS+2], sum for y[IS+3]).
	 * Without SSE3 the haddpd is emulated with unpack + add.
	 */
#ifndef HAVE_SSE3
	movapd	xsum1, atemp1
	movapd	xsum3, atemp3

	unpcklpd xsum2, xsum1
	unpcklpd xsum4, xsum3

	unpckhpd xsum2, atemp1
	unpckhpd xsum4, atemp3

	addpd	 atemp1, xsum1
	addpd	 atemp3, xsum3
#else
	haddpd	 xsum2, xsum1
	haddpd	 xsum4, xsum3
#endif

	/* y[IS .. IS+3] += reduced sums. */
	movsd	 0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhpd	 1 * SIZE(NEW_Y, IS, SIZE), yy1
	movsd	 2 * SIZE(NEW_Y, IS, SIZE), yy2
	movhpd	 3 * SIZE(NEW_Y, IS, SIZE), yy2

	addpd	 xsum1, yy1
	addpd	 xsum3, yy2

	movsd	 yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhpd	 yy1, 1 * SIZE(NEW_Y, IS, SIZE)
	movsd	 yy2, 2 * SIZE(NEW_Y, IS, SIZE)
	movhpd	 yy2, 3 * SIZE(NEW_Y, IS, SIZE)

	addq	 $4, IS

	/* Another full 4-column panel exists while IS + 4 <= N. */
	movq	 IS, I
	addq	 $4, I
	cmpq	 N, I
	jle	 .L11
	ALIGN_3

.L20:
	/*
	 * Two remaining columns (bit 1 of N set).  Handles the 2x2 diagonal
	 * block and at most one row below it.
	 * NOTE(review): this presumably relies on the caller leaving no more
	 * than one row below this block -- confirm against the driver.
	 */
	testq	$2, N
	jle	.L30

	movq	A,  A1
	leaq	2 * SIZE(A, LDA, 2), A		/* next diagonal element */

	movapd		0 * SIZE(NEW_X, IS, SIZE), atemp2	/* (x0, x1) */

	/* 2x2 diagonal block, elements on or below the diagonal only. */
	movsd	 0 * SIZE(A1), xsum1
	movhpd	 1 * SIZE(A1), xsum1
	mulpd	 atemp2, xsum1			/* a(0,0)*x0, a(1,0)*x1 */

	movsd	 1 * SIZE(A1), xsum2
	movhpd	 1 * SIZE(A1, LDA, 1), xsum2
	mulpd	 atemp2, xsum2			/* a(1,0)*x0, a(1,1)*x1 */

	/* Broadcast: atemp1 = (x0, x0), atemp2 = (x1, x1). */
#ifndef HAVE_SSE3
	movapd	 atemp2, atemp1
	unpcklpd atemp1, atemp1
#else
	movddup	 atemp2, atemp1
#endif
	unpckhpd atemp2, atemp2

	/* One extra row below the block when M is odd. */
	testq	$1, M
	jle	.L29

	movsd	 2 * SIZE(A1), a1
	movsd	 2 * SIZE(A1, LDA, 1), a2
	movsd	 2 * SIZE(NEW_X, IS, SIZE), xtemp1
	movsd	 2 * SIZE(NEW_Y, IS, SIZE), yy1

	/*
	 * Scalar arithmetic only: just the low lanes carry data and only the
	 * low lane of yy1 is stored.
	 */
	movapd	 xtemp1, xt1
	mulsd	 a1,     xt1
	mulsd	 atemp1, a1
	addsd	 xt1,    xsum1
	addsd	 a1,     yy1

	movapd	 xtemp1, xt1
	mulsd	 a2,     xt1
	mulsd	 atemp2, a2
	addsd	 xt1,    xsum2
	addsd	 a2,     yy1

	movsd	 yy1, 2 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L29:
	/* Reduce to xsum1 = (sum for y[IS], sum for y[IS+1]) and update y. */
#ifndef HAVE_SSE3
	movapd	xsum1, atemp1
	unpcklpd xsum2, xsum1
	unpckhpd xsum2, atemp1
	addpd	 atemp1, xsum1
#else
	haddpd	 xsum2, xsum1
#endif

	movsd	 0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhpd	 1 * SIZE(NEW_Y, IS, SIZE), yy1

	addpd	 xsum1, yy1

	movsd	 yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhpd	 yy1, 1 * SIZE(NEW_Y, IS, SIZE)

	addq	 $2, IS
	ALIGN_3

.L30:
	/*
	 * One remaining column (N odd): only the diagonal element is used,
	 * y[IS] += A[0] * x[IS], where A addresses that diagonal element and x
	 * is the packed (alpha-scaled) vector.
	 */
	testq	$1, N
	jle	.L990

	movsd	 0 * SIZE(A), xsum1
	movsd	 0 * SIZE(NEW_X, IS, SIZE), atemp1
	movsd	 0 * SIZE(NEW_Y, IS, SIZE), yy1

	mulsd	 atemp1, xsum1
	addsd	 xsum1, yy1
	movsd	 yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L990:
	/*
	 * Write-back.  With contiguous y (INCY == SIZE bytes) the update was
	 * done in place and nothing remains to do.  Otherwise NEW_Y points at
	 * the packed copy in the buffer, which is scattered back to the
	 * caller's y with stride INCY.
	 */
	cmpq   $SIZE, INCY
	je    .L999

	movq	M,  %rax
	sarq	$3, %rax		/* number of groups of 8 elements */
	jle	.L997
	ALIGN_3

.L996:
	movapd	 0 * SIZE(NEW_Y), %xmm0
	movapd	 2 * SIZE(NEW_Y), %xmm1
	movapd	 4 * SIZE(NEW_Y), %xmm2
	movapd	 6 * SIZE(NEW_Y), %xmm3

	movsd	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2,  0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm2,  0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3,  0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm3,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	/* Remaining M mod 8 elements. */
	movq	M, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movsd	0 * SIZE(NEW_Y), %xmm0

	movsd	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3


.L999:
	/* Common exit: restore the registers saved in the prologue and return. */
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	/* Extra callee-saved registers of the Microsoft x64 ABI. */
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	/* Release the frame. */
	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE
